In [1]:
import keras
from os.path import join
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras.utils import np_utils


Using TensorFlow backend.

Data preprocessing


In [2]:
file_names = ['stsa.fine.test', 'stsa.fine.train', 'stsa.fine.dev']
file_path = '/home/bruce/data/sentiment/citai_process'
def read_file(fname=''):
    # Each line is '<label> <sentence>': one digit, a space, then the text.
    with open(join(file_path, fname)) as fr:
        lines = fr.readlines()
    lines = [line.strip() for line in lines]
    labels = [int(line[0:1]) for line in lines]
    characters = [list(line[2:]) for line in lines]  # character-level tokens
    return characters, labels
train_X, train_y = read_file(fname='stsa.fine.train')
test_X, test_y = read_file(fname='stsa.fine.test')
dev_X, dev_y = read_file(fname='stsa.fine.dev')
print(len(train_X))
print(len(test_X))
print(len(dev_X))
print(train_X[0:2])
print(train_y[0:2])


8544
2210
1101
[['a', ' ', 's', 't', 'i', 'r', 'r', 'i', 'n', 'g', ' ', ',', ' ', 'f', 'u', 'n', 'n', 'y', ' ', 'a', 'n', 'd', ' ', 'f', 'i', 'n', 'a', 'l', 'l', 'y', ' ', 't', 'r', 'a', 'n', 's', 'p', 'o', 'r', 't', ' ', 'r', 'e', '-', 'i', 'm', 'a', 'g', 'i', 'n', 'i', 'n', 'g', ' ', 'o', 'f', ' ', 'b', 'e', 'a', 'u', 't', 'y', ' ', 'a', 'n', 'd', ' ', 't', 'h', 'e', ' ', 'b', 'e', 'a', 's', 't', ' ', 'a', 'n', 'd', ' ', '1', '9', '3', '0', 's', ' ', 'h', 'o', 'r', 'r', 'o', 'r', ' ', 'f', 'i', 'l', 'm'], ['a', 'p', 'p', 'a', 'r', 'e', 'n', 't', 'l', 'y', ' ', 'r', 'e', 'a', 's', 's', 'e', 'm', 'b', 'l', 'e', ' ', 'f', 'r', 'o', 'm', ' ', 't', 'h', 'e', ' ', 'c', 'u', 't', 't', 'i', 'n', 'g', '-', 'r', 'o', 'o', 'm', ' ', 'f', 'l', 'o', 'o', 'r', ' ', 'o', 'f', ' ', 'a', 'n', 'y', ' ', 'g', 'i', 'v', 'e', ' ', 'd', 'a', 'y', 't', 'i', 'm', 'e', ' ', 's', 'o', 'a', 'p', ' ', '.']]
[4, 1]
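
The parser above assumes each raw line is a digit label, one space, then the sentence. A minimal illustration (the sample line is reconstructed from the printed output above, not copied from the actual file):

In [ ]:
line = '4 a stirring , funny and finally transport re-imagining ...'
label = int(line[0:1])   # -> 4
chars = list(line[2:])   # -> ['a', ' ', 's', 't', 'i', 'r', ...]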

Sentence length statistics


In [3]:
def length_statistics(arrays=[]):
    # Print length percentiles to help choose a padding length.
    lengths = sorted(len(i) for i in arrays)
    length = len(lengths)
    print('length = ', length)
    print('max = ', lengths[-1])
    print('min =', lengths[0])
    print('average = ', sum(lengths) / length)
    print('top 50% = ', lengths[int(0.5 * length)])
    print('top 80% = ', lengths[int(0.8 * length)])
    print('top 90% = ', lengths[int(0.9 * length)])
    print('top 95% = ', lengths[int(0.95 * length)])

length_statistics(arrays=train_X)


length =  8544
max =  279
min = 4
average =  100.29693352059925
top 50% =  96
top 80% =  144
top 90% =  170
top 95% =  190
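
As a cross-check, the same cut-offs can be read with numpy's percentile function (a sketch; numpy is assumed available, since the TensorFlow backend already depends on it):

In [ ]:
import numpy as np
lengths = [len(x) for x in train_X]
print(np.percentile(lengths, [50, 80, 90, 95]))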

Character to index


In [4]:
def token_to_index(datas=[]):
    # Build a character -> integer index (starting at 1; 0 is reserved for padding),
    # then replace every character in the given datasets with its index.
    word_index = {}
    count = 1
    for data in datas:
        for list_ in data:
            for w in list_:
                if w not in word_index:
                    word_index[w] = count
                    count = count + 1
    print('length of word_index =', len(word_index))
    for i in range(len(datas)):
        datas[i] = [[word_index[w] for w in line] for line in datas[i]]
    return datas, word_index

In [5]:
X,word_index = token_to_index(datas=[train_X,dev_X])
train_X,dev_X = X
print(len(word_index))
print(word_index)


length of word_index = 97
97
{'â': 81, 'r': 6, '5': 75, 'h': 21, 'í': 64, 'v': 27, '&': 78, '*': 77, '8': 73, 'g': 8, 'X': 91, 'è': 66, ';': 55, 'W': 71, 'D': 37, 'u': 11, '-': 18, 'à': 97, '`': 49, 'á': 79, '7': 60, '0': 25, '%': 96, 'U': 88, 'O': 82, '.': 28, ',': 9, '2': 48, 'A': 47, '9': 23, 'I': 54, 'i': 5, 'P': 53, 'j': 32, 'd': 13, 'T': 74, 's': 3, '!': 51, 'ô': 94, 'z': 42, 'F': 57, 'N': 72, 'Q': 90, 'p': 15, "'": 30, 'S': 33, 'J': 69, 'l': 14, 'K': 61, '#': 87, 'k': 31, 'q': 36, '$': 39, 'Y': 62, 'f': 10, 'a': 1, 'ç': 89, 'c': 26, 'w': 29, 'æ': 85, '\\': 58, 'm': 19, 'R': 50, 'x': 35, 'C': 44, 'e': 17, 'é': 34, '@': 40, 't': 4, 'ó': 65, 'M': 38, 'o': 16, 'V': 67, 'E': 45, 'B': 46, ' ': 2, 'Z': 70, 'H': 68, 'G': 56, 'n': 7, 'y': 12, '1': 22, '=': 83, '?': 52, '6': 63, '3': 24, '+': 93, '4': 76, 'b': 20, 'ã': 86, ':': 43, 'ü': 80, 'L': 41, 'ñ': 92, 'ï': 84, 'ö': 95, '/': 59}

In [6]:
print(train_X[0])


[1, 2, 3, 4, 5, 6, 6, 5, 7, 8, 2, 9, 2, 10, 11, 7, 7, 12, 2, 1, 7, 13, 2, 10, 5, 7, 1, 14, 14, 12, 2, 4, 6, 1, 7, 3, 15, 16, 6, 4, 2, 6, 17, 18, 5, 19, 1, 8, 5, 7, 5, 7, 8, 2, 16, 10, 2, 20, 17, 1, 11, 4, 12, 2, 1, 7, 13, 2, 4, 21, 17, 2, 20, 17, 1, 3, 4, 2, 1, 7, 13, 2, 22, 23, 24, 25, 3, 2, 21, 16, 6, 6, 16, 6, 2, 10, 5, 14, 19]
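
To sanity-check the mapping, the index sequence can be decoded back to text by inverting word_index (a small sketch using only names defined above):

In [ ]:
index_to_char = {i: w for w, i in word_index.items()}
print(''.join(index_to_char[i] for i in train_X[0]))
# -> 'a stirring , funny and finally transport re-imagining of ...'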

Build the model


In [10]:
max_len = 190          # 95th-percentile sentence length from the statistics above
batch_size = 32

max_features = 100     # 97 characters (indices 1-97) plus padding index 0
embedding_dims = 100

nb_filter = 150
filter_length = 2
dense1_hidden = 100
nb_classes = 5

In [8]:
print('Build model...')
model = Sequential()
model.add(Embedding(input_dim=max_features,
                    output_dim=embedding_dims))
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode='valid',
                        activation='relu',
                        subsample_length=1))
model.add(GlobalMaxPooling1D())
model.add(Dense(dense1_hidden))
model.add(Dropout(0.2))
model.add(Activation('relu'))

model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adadelta',
              metrics=['accuracy'])
print('finish build')


Build model...
finish build
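
To confirm the layer stacking, model.summary() can be printed; the parameter counts in the comments below are hand-computed from the hyperparameters above, not taken from a recorded run:

In [ ]:
# Embedding: max_features * embedding_dims        = 100 * 100         = 10,000 params
# Conv1D:    nb_filter * (filter_length * embedding_dims + 1)
#                                                  = 150 * (2*100 + 1) = 30,150 params
model.summary()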

Model input


In [9]:
print(type(train_y[0]))
train_y = np_utils.to_categorical(train_y, nb_classes)
dev_y = np_utils.to_categorical(dev_y, nb_classes)
train_X = sequence.pad_sequences(train_X, maxlen=max_len)
dev_X = sequence.pad_sequences(dev_X, maxlen=max_len)


<class 'int'>
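
Note that pad_sequences pads on the left by default (padding='pre'), so shorter sentences get leading zeros; a toy example:

In [ ]:
print(sequence.pad_sequences([[1, 2, 3]], maxlen=5))
# [[0 0 1 2 3]]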

In [11]:
def my_generator(X=None, y=None):
    # Yield consecutive mini-batches forever, cycling through the data.
    i = 0
    max_i = int(len(X) / batch_size)
    while True:
        i = i % max_i
        x_batch = X[i * batch_size:(i + 1) * batch_size]
        y_batch = y[i * batch_size:(i + 1) * batch_size]
        yield (x_batch, y_batch)
        i = i + 1
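
A quick smoke test of the generator (the shapes assume the padding and one-hot encoding from the previous cell):

In [ ]:
x_batch, y_batch = next(my_generator(train_X, train_y))
print(x_batch.shape, y_batch.shape)  # (32, 190) (32, 5)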

Train the model


In [13]:
model.fit_generator(my_generator(train_X, train_y), samples_per_epoch=32*30,
                    nb_epoch=100, verbose=1, validation_data=(dev_X, dev_y))


Epoch 1/100
960/960 [==============================] - 2s - loss: 1.5399 - acc: 0.3094 - val_loss: 1.5583 - val_acc: 0.2797
Epoch 2/100
960/960 [==============================] - 1s - loss: 1.5472 - acc: 0.2917 - val_loss: 1.5453 - val_acc: 0.3224
Epoch 3/100
960/960 [==============================] - 1s - loss: 1.5430 - acc: 0.3000 - val_loss: 1.5414 - val_acc: 0.3152
Epoch 4/100
960/960 [==============================] - 1s - loss: 1.5376 - acc: 0.3115 - val_loss: 1.5477 - val_acc: 0.2897
Epoch 5/100
960/960 [==============================] - 1s - loss: 1.5315 - acc: 0.3302 - val_loss: 1.5408 - val_acc: 0.3152
Epoch 6/100
960/960 [==============================] - 1s - loss: 1.5500 - acc: 0.2885 - val_loss: 1.5453 - val_acc: 0.3261
Epoch 7/100
960/960 [==============================] - 2s - loss: 1.5153 - acc: 0.3323 - val_loss: 1.5338 - val_acc: 0.3252
Epoch 8/100
960/960 [==============================] - 1s - loss: 1.5482 - acc: 0.2969 - val_loss: 1.5338 - val_acc: 0.3243
Epoch 9/100
960/960 [==============================] - 1s - loss: 1.5448 - acc: 0.2948 - val_loss: 1.5355 - val_acc: 0.3025
Epoch 10/100
960/960 [==============================] - 1s - loss: 1.5232 - acc: 0.3333 - val_loss: 1.5341 - val_acc: 0.3233
Epoch 11/100
960/960 [==============================] - 1s - loss: 1.5237 - acc: 0.3073 - val_loss: 1.5213 - val_acc: 0.3252
Epoch 12/100
960/960 [==============================] - 1s - loss: 1.5188 - acc: 0.3083 - val_loss: 1.5344 - val_acc: 0.3034
Epoch 13/100
960/960 [==============================] - 1s - loss: 1.5196 - acc: 0.3115 - val_loss: 1.5218 - val_acc: 0.3243
Epoch 14/100
960/960 [==============================] - 1s - loss: 1.5123 - acc: 0.3240 - val_loss: 1.5194 - val_acc: 0.3379
Epoch 15/100
960/960 [==============================] - 2s - loss: 1.5313 - acc: 0.3031 - val_loss: 1.5223 - val_acc: 0.3333
Epoch 16/100
960/960 [==============================] - 1s - loss: 1.5048 - acc: 0.3375 - val_loss: 1.5193 - val_acc: 0.3342
Epoch 17/100
960/960 [==============================] - 1s - loss: 1.5146 - acc: 0.3146 - val_loss: 1.5129 - val_acc: 0.3188
Epoch 18/100
960/960 [==============================] - 1s - loss: 1.5180 - acc: 0.3187 - val_loss: 1.5151 - val_acc: 0.3261
Epoch 19/100
960/960 [==============================] - 1s - loss: 1.5090 - acc: 0.3469 - val_loss: 1.5151 - val_acc: 0.3270
Epoch 20/100
960/960 [==============================] - 1s - loss: 1.5001 - acc: 0.3240 - val_loss: 1.5061 - val_acc: 0.3297
Epoch 21/100
960/960 [==============================] - 1s - loss: 1.4966 - acc: 0.3177 - val_loss: 1.5107 - val_acc: 0.3224
Epoch 22/100
960/960 [==============================] - 2s - loss: 1.5083 - acc: 0.3219 - val_loss: 1.5074 - val_acc: 0.3297
Epoch 23/100
960/960 [==============================] - 1s - loss: 1.4942 - acc: 0.3479 - val_loss: 1.5123 - val_acc: 0.3206
Epoch 24/100
960/960 [==============================] - 1s - loss: 1.5193 - acc: 0.3115 - val_loss: 1.5201 - val_acc: 0.3170
Epoch 25/100
960/960 [==============================] - 2s - loss: 1.5013 - acc: 0.3406 - val_loss: 1.5043 - val_acc: 0.3233
Epoch 26/100
960/960 [==============================] - 1s - loss: 1.4928 - acc: 0.3396 - val_loss: 1.5013 - val_acc: 0.3252
Epoch 27/100
960/960 [==============================] - 1s - loss: 1.5135 - acc: 0.3156 - val_loss: 1.5201 - val_acc: 0.3351
Epoch 28/100
960/960 [==============================] - 2s - loss: 1.4861 - acc: 0.3583 - val_loss: 1.5057 - val_acc: 0.3215
Epoch 29/100
960/960 [==============================] - 1s - loss: 1.4866 - acc: 0.3271 - val_loss: 1.4987 - val_acc: 0.3306
Epoch 30/100
960/960 [==============================] - 1s - loss: 1.4840 - acc: 0.3417 - val_loss: 1.5065 - val_acc: 0.3261
Epoch 31/100
960/960 [==============================] - 2s - loss: 1.4878 - acc: 0.3521 - val_loss: 1.5151 - val_acc: 0.3206
Epoch 32/100
960/960 [==============================] - 1s - loss: 1.4785 - acc: 0.3667 - val_loss: 1.5043 - val_acc: 0.3197
Epoch 33/100
960/960 [==============================] - 1s - loss: 1.4874 - acc: 0.3344 - val_loss: 1.5084 - val_acc: 0.3288
Epoch 34/100
960/960 [==============================] - 1s - loss: 1.4817 - acc: 0.3448 - val_loss: 1.5326 - val_acc: 0.3215
Epoch 35/100
960/960 [==============================] - 1s - loss: 1.4788 - acc: 0.3375 - val_loss: 1.4958 - val_acc: 0.3233
Epoch 36/100
960/960 [==============================] - 1s - loss: 1.5017 - acc: 0.3271 - val_loss: 1.4929 - val_acc: 0.3370
Epoch 37/100
960/960 [==============================] - 1s - loss: 1.4720 - acc: 0.3500 - val_loss: 1.4946 - val_acc: 0.3306
Epoch 38/100
960/960 [==============================] - 1s - loss: 1.4689 - acc: 0.3521 - val_loss: 1.4965 - val_acc: 0.3460
Epoch 39/100
960/960 [==============================] - 1s - loss: 1.4677 - acc: 0.3417 - val_loss: 1.4895 - val_acc: 0.3415
Epoch 40/100
960/960 [==============================] - 1s - loss: 1.4811 - acc: 0.3271 - val_loss: 1.4922 - val_acc: 0.3415
Epoch 41/100
960/960 [==============================] - 1s - loss: 1.4605 - acc: 0.3583 - val_loss: 1.4960 - val_acc: 0.3243
Epoch 42/100
960/960 [==============================] - 2s - loss: 1.4907 - acc: 0.3385 - val_loss: 1.4952 - val_acc: 0.3388
Epoch 43/100
960/960 [==============================] - 1s - loss: 1.4636 - acc: 0.3740 - val_loss: 1.4915 - val_acc: 0.3370
Epoch 44/100
960/960 [==============================] - 1s - loss: 1.4701 - acc: 0.3448 - val_loss: 1.4957 - val_acc: 0.3351
Epoch 45/100
960/960 [==============================] - 1s - loss: 1.4970 - acc: 0.3083 - val_loss: 1.4976 - val_acc: 0.3415
Epoch 46/100
960/960 [==============================] - 1s - loss: 1.4595 - acc: 0.3542 - val_loss: 1.5006 - val_acc: 0.3324
Epoch 47/100
960/960 [==============================] - 2s - loss: 1.4546 - acc: 0.3500 - val_loss: 1.4897 - val_acc: 0.3315
Epoch 48/100
960/960 [==============================] - 1s - loss: 1.4736 - acc: 0.3438 - val_loss: 1.4906 - val_acc: 0.3333
Epoch 49/100
960/960 [==============================] - 1s - loss: 1.4659 - acc: 0.3604 - val_loss: 1.4987 - val_acc: 0.3351
Epoch 50/100
960/960 [==============================] - 1s - loss: 1.4592 - acc: 0.3594 - val_loss: 1.5035 - val_acc: 0.3215
Epoch 51/100
960/960 [==============================] - 1s - loss: 1.4835 - acc: 0.3562 - val_loss: 1.4887 - val_acc: 0.3342
Epoch 52/100
960/960 [==============================] - 2s - loss: 1.4416 - acc: 0.3740 - val_loss: 1.4961 - val_acc: 0.3433
Epoch 53/100
960/960 [==============================] - 1s - loss: 1.4620 - acc: 0.3604 - val_loss: 1.5024 - val_acc: 0.3252
Epoch 54/100
960/960 [==============================] - 1s - loss: 1.4749 - acc: 0.3313 - val_loss: 1.4919 - val_acc: 0.3415
Epoch 55/100
960/960 [==============================] - 1s - loss: 1.4540 - acc: 0.3740 - val_loss: 1.4948 - val_acc: 0.3315
Epoch 56/100
960/960 [==============================] - 1s - loss: 1.4532 - acc: 0.3406 - val_loss: 1.4827 - val_acc: 0.3351
Epoch 57/100
960/960 [==============================] - 1s - loss: 1.4550 - acc: 0.3469 - val_loss: 1.4840 - val_acc: 0.3497
Epoch 58/100
960/960 [==============================] - 1s - loss: 1.4674 - acc: 0.3635 - val_loss: 1.4864 - val_acc: 0.3370
Epoch 59/100
960/960 [==============================] - 1s - loss: 1.4657 - acc: 0.3490 - val_loss: 1.4964 - val_acc: 0.3179
Epoch 60/100
960/960 [==============================] - 1s - loss: 1.4683 - acc: 0.3594 - val_loss: 1.4975 - val_acc: 0.3324
Epoch 61/100
960/960 [==============================] - 2s - loss: 1.4423 - acc: 0.3688 - val_loss: 1.4902 - val_acc: 0.3406
Epoch 62/100
960/960 [==============================] - 1s - loss: 1.4472 - acc: 0.3500 - val_loss: 1.4792 - val_acc: 0.3324
Epoch 63/100
 32/960 [>.............................] - ETA: 1s - loss: 1.5350 - acc: 0.2500
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-13-123acb64e3c0> in <module>()
----> 1 model.fit_generator(my_generator(train_X,train_y),samples_per_epoch = 32*30,nb_epoch=100,verbose=1,validation_data=(dev_X,dev_y))

/home/bruce/anaconda3/lib/python3.5/site-packages/keras/models.py in fit_generator(self, generator, samples_per_epoch, nb_epoch, verbose, callbacks, validation_data, nb_val_samples, class_weight, max_q_size, nb_worker, pickle_safe, **kwargs)
    872                                         max_q_size=max_q_size,
    873                                         nb_worker=nb_worker,
--> 874                                         pickle_safe=pickle_safe)
    875 
    876     def evaluate_generator(self, generator, val_samples, max_q_size=10, nb_worker=1, pickle_safe=False, **kwargs):

/home/bruce/anaconda3/lib/python3.5/site-packages/keras/engine/training.py in fit_generator(self, generator, samples_per_epoch, nb_epoch, verbose, callbacks, validation_data, nb_val_samples, class_weight, max_q_size, nb_worker, pickle_safe)
   1441                     outs = self.train_on_batch(x, y,
   1442                                                sample_weight=sample_weight,
-> 1443                                                class_weight=class_weight)
   1444                 except:
   1445                     _stop.set()

/home/bruce/anaconda3/lib/python3.5/site-packages/keras/engine/training.py in train_on_batch(self, x, y, sample_weight, class_weight)
   1219             ins = x + y + sample_weights
   1220         self._make_train_function()
-> 1221         outputs = self.train_function(ins)
   1222         if len(outputs) == 1:
   1223             return outputs[0]

/home/bruce/anaconda3/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py in __call__(self, inputs)
   1011             feed_dict[tensor] = value
   1012         session = get_session()
-> 1013         updated = session.run(self.outputs + [self.updates_op], feed_dict=feed_dict)
   1014         return updated[:len(self.outputs)]
   1015 

/home/bruce/anaconda3/lib/python3.5/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    708     try:
    709       result = self._run(None, fetches, feed_dict, options_ptr,
--> 710                          run_metadata_ptr)
    711       if run_metadata:
    712         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/home/bruce/anaconda3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
    906     if final_fetches or final_targets:
    907       results = self._do_run(handle, final_targets, final_fetches,
--> 908                              feed_dict_string, options, run_metadata)
    909     else:
    910       results = []

/home/bruce/anaconda3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
    956     if handle is None:
    957       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
--> 958                            target_list, options, run_metadata)
    959     else:
    960       return self._do_call(_prun_fn, self._session, handle, feed_dict,

/home/bruce/anaconda3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
    963   def _do_call(self, fn, *args):
    964     try:
--> 965       return fn(*args)
    966     except errors.OpError as e:
    967       message = compat.as_text(e.message)

/home/bruce/anaconda3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
    945         return tf_session.TF_Run(session, options,
    946                                  feed_dict, fetch_list, target_list,
--> 947                                  status, run_metadata)
    948 
    949     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 

Prediction
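
Note: test_X still holds raw characters, because token_to_index above was only run on the training and development sets. The cell below therefore first maps the test characters through the same word_index, sending any character unseen in train/dev to index 0 (the padding index) as a fallback, then pads and one-hot encodes to match the model input.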


In [ ]:
test_X = [[word_index.get(w, 0) for w in line] for line in test_X]  # unseen chars -> 0
test_X = sequence.pad_sequences(test_X, maxlen=max_len)
test_y = np_utils.to_categorical(test_y, nb_classes)
score = model.evaluate(test_X, test_y, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])